/* linbox/algorithms/opencl-domain.h
 * Copyright (C) 2012 Matthew Wezowicz
 *
 * Written by Matthew Wezowicz <mwezz@udel.edu>
 *
 * ========LICENCE========
 * This file is part of the library LinBox.
 *
 * LinBox is free software: you can redistribute it and/or modify
 * it under the terms of the  GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * ========LICENCE========
 *.
 */

 /*###---    Generated On     ---###*/
 /*###--- 07/24/2012 11:29:38 ---###*/

#ifndef __LINBOX_opencl_matrix_domain_kernels_INL
#define __LINBOX_opencl_matrix_domain_kernels_INL

namespace LinBox{

	const char* matrixMulKernelModular1DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMulKernelModular1DP(__global double* C, __global double* A, __global double* B,\n"
		"		const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Csub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular1SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMulKernelModular1SP(__global float* C, __global float* A, __global float* B,\n"
		"		const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Csub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular8DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMulKernelModular8DP(__global double* C, __global double* A, __global double* B,\n"
		"		const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Csub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular16SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMulKernelModular16SP(__global float* C, __global float* A, __global float* B,\n"
		"		const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Csub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		Csub = fmod(Csub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular32DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMulKernelModular32DP(__global double* C, __global double* A, __global double* B,\n"
		"		const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Csub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Csub = fmod(Csub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Csub = fmod(Csub, mod);\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular32SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMulKernelModular32SP(__global float* C, __global float* A, __global float* B,\n"
		"		const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Csub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Csub = fmod(Csub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Csub = fmod(Csub, mod);\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular1024DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMulKernelModular1024DP(__global double* C, __global double* A, __global double* B,\n"
		"		const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Csub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Csub = fmod(Csub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Csub = fmod(Csub, mod);\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMulKernelModular1024SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMulKernelModular1024SP(__global float* C, __global float* A, __global float* B,\n"
		"		const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Csub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Csub += As[ty][0] * Bs[0][tx];\n"
		"		Csub += As[ty][1] * Bs[1][tx];\n"
		"		Csub += As[ty][2] * Bs[2][tx];\n"
		"		Csub += As[ty][3] * Bs[3][tx];\n"
		"		Csub += As[ty][4] * Bs[4][tx];\n"
		"		Csub += As[ty][5] * Bs[5][tx];\n"
		"		Csub += As[ty][6] * Bs[6][tx];\n"
		"		Csub += As[ty][7] * Bs[7][tx];\n"
		"		Csub += As[ty][8] * Bs[8][tx];\n"
		"		Csub += As[ty][9] * Bs[9][tx];\n"
		"		Csub += As[ty][10] * Bs[10][tx];\n"
		"		Csub += As[ty][11] * Bs[11][tx];\n"
		"		Csub += As[ty][12] * Bs[12][tx];\n"
		"		Csub += As[ty][13] * Bs[13][tx];\n"
		"		Csub += As[ty][14] * Bs[14][tx];\n"
		"		Csub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Csub = fmod(Csub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Csub = fmod(Csub, mod);\n"
		"	int c = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	C[c + ty * widthB + tx] = Csub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular1DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMuladdKernelModular1DP(__global double* D, double alpha, __global double* A, __global double* B,\n"
		"		double beta, __global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	double Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular1SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMuladdKernelModular1SP(__global float* D, float alpha, __global float* A, __global float* B,\n"
		"		float beta, __global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	float Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular8DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMuladdKernelModular8DP(__global double* D, double alpha, __global double* A, __global double* B,\n"
		"		double beta, __global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	double Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular16SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMuladdKernelModular16SP(__global float* D, float alpha, __global float* A, __global float* B,\n"
		"		float beta, __global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int m = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	float Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular32DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMuladdKernelModular32DP(__global double* D, double alpha, __global double* A, __global double* B,\n"
		"		double beta, __global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	double Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular32SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMuladdKernelModular32SP(__global float* D, float alpha, __global float* A, __global float* B,\n"
		"		float beta, __global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	float Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular1024DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMuladdKernelModular1024DP(__global double* D, double alpha, __global double* A, __global double* B,\n"
		"		double beta, __global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	double Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMuladdKernelModular1024SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMuladdKernelModular1024SP(__global float* D, float alpha, __global float* A, __global float* B,\n"
		"		float beta, __global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin = widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	Dsub = alpha * Dsub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	if(Dsub < 0){\n"
		"		Dsub = mod + Dsub;\n"
		"	}\n"
		"	float Csub = C[d + ty * widthB + tx];\n"
		"	Csub = beta * Csub;\n"
		"	Csub = fmod(Csub, mod);\n"
		"	if(Csub < 0){\n"
		"		Csub = mod + Csub;\n"
		"	}\n"
		"	Dsub = Dsub + Csub;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular1DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxpyKernelModular1DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular1SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxpyKernelModular1SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular8DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxpyKernelModular8DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular16SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxpyKernelModular16SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular32DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxpyKernelModular32DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular32SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxpyKernelModular32SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular1024DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxpyKernelModular1024DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxpyKernelModular1024SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxpyKernelModular1024SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub + c;\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular1DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMaxpyKernelModular1DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular1SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMaxpyKernelModular1SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular8DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMaxpyKernelModular8DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular16SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMaxpyKernelModular16SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular32DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMaxpyKernelModular32DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular32SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMaxpyKernelModular32SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular1024DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixMaxpyKernelModular1024DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixMaxpyKernelModular1024SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixMaxpyKernelModular1024SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = c - Dsub;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular1DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxmyKernelModular1DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular1SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxmyKernelModular1SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular8DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxmyKernelModular8DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular16SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxmyKernelModular16SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		Dsub = fmod(Dsub, mod);\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular32DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxmyKernelModular32DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular32SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxmyKernelModular32SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 2){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular1024DP = {
		"#define BLOCK_SIZE 16\n"
		"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
		"__kernel void matrixAxmyKernelModular1024DP(__global double* D, __global double* A, __global double* B,\n"
		"		__global double* C, const int widthA, const int widthB, const double mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local double As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local double Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	double Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	double c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

	const char* matrixAxmyKernelModular1024SP = {
		"#define BLOCK_SIZE 16\n"
		"__kernel void matrixAxmyKernelModular1024SP(__global float* D, __global float* A, __global float* B,\n"
		"		__global float* C, const int widthA, const int widthB, const float mod){\n"
		"	int bx = get_group_id(0);\n"
		"	int by = get_group_id(1);\n"
		"	int tx = get_local_id(0);\n"
		"	int ty = get_local_id(1);\n"
		"	int aBegin= widthA * BLOCK_SIZE * by;\n"
		"	int aEnd = aBegin + widthA - 1;\n"
		"	int aStep = BLOCK_SIZE;\n"
		"	int bBegin = BLOCK_SIZE * bx;\n"
		"	int bStep = BLOCK_SIZE * widthB;\n"
		"	__local float As[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	__local float Bs[BLOCK_SIZE][BLOCK_SIZE];\n"
		"	float Dsub = 0;\n"
		"	int mCount = 0;\n"
		"	for(int a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep){\n"
		"		As[ty][tx] = A[a + widthA * ty + tx];\n"
		"		Bs[ty][tx] = B[b + widthB * ty + tx];\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"		Dsub += As[ty][0] * Bs[0][tx];\n"
		"		Dsub += As[ty][1] * Bs[1][tx];\n"
		"		Dsub += As[ty][2] * Bs[2][tx];\n"
		"		Dsub += As[ty][3] * Bs[3][tx];\n"
		"		Dsub += As[ty][4] * Bs[4][tx];\n"
		"		Dsub += As[ty][5] * Bs[5][tx];\n"
		"		Dsub += As[ty][6] * Bs[6][tx];\n"
		"		Dsub += As[ty][7] * Bs[7][tx];\n"
		"		Dsub += As[ty][8] * Bs[8][tx];\n"
		"		Dsub += As[ty][9] * Bs[9][tx];\n"
		"		Dsub += As[ty][10] * Bs[10][tx];\n"
		"		Dsub += As[ty][11] * Bs[11][tx];\n"
		"		Dsub += As[ty][12] * Bs[12][tx];\n"
		"		Dsub += As[ty][13] * Bs[13][tx];\n"
		"		Dsub += As[ty][14] * Bs[14][tx];\n"
		"		Dsub += As[ty][15] * Bs[15][tx];\n"
		"		mCount++;\n"
		"		if(mCount == 64){\n"
		"			Dsub = fmod(Dsub, mod);\n"
		"			mCount = 0;\n"
		"		}\n"
		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
		"	}\n"
		"	Dsub = fmod(Dsub, mod);\n"
		"	int d = widthB * BLOCK_SIZE * by + BLOCK_SIZE * bx;\n"
		"	float c = C[d + ty * widthB + tx];\n"
		"	Dsub = Dsub - c;\n"
		"	Dsub = fmod((mod + Dsub), mod);\n"
		"	D[d + ty * widthB + tx] = Dsub;\n"
		"}\n"
	};

} // end of namespace LinBox

#endif // __LINBOX_opencl_matrix_domain_kernels_INL

// Local Variables:
// mode: C++
// tab-width: 4
// indent-tabs-mode: nil
// c-basic-offset: 4
// End:
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
